library(feather)
library(foreach)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:plyr':
##
## here
## The following object is masked from 'package:base':
##
## date
library(imputeTS)
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## fitted.fracdiff fracdiff
## residuals.fracdiff fracdiff
library(tictoc)
input_folder = "../../Seasonality-Restricted-Access-Repo/Data/Clue_subset/input/"
input_folder = "../../Seasonality-Restricted-Access-Repo/Data/Clue/input/"
tmp_folder = "../../Seasonality-Restricted-Access-Repo/Data/Clue/tmp/"
users = read_feather(path = paste0(input_folder, "users.feather"))
bc = read_feather(path = paste0(input_folder, "birth_control.feather"))
ggplot(users[users$country_area %in% names(sort(table(users$country_area),decreasing = TRUE))[1:10],],
aes(x = latest_birth_control, fill = country_area))+
geom_bar()+
facet_grid(country_area ~ .)

countries = c("Brazil - Central-West","Brazil - Northeast","United Kingdom","France","Germany","Mexico","United States - Midwest","United States - California","Canada","Australia")
birth_controls = c("pill","none","condoms")
j = which((users$country_area %in% countries) & (users$latest_birth_control %in% birth_controls))
ggplot(users[j,],
aes(x = latest_birth_control, fill = country_area))+
geom_bar(position = "dodge")

users = users[j,]
tracking_files = list.files(paste0(input_folder,"tracking/"))
tmp_folder_for_tracking_with_BC_and_country = paste0(tmp_folder,"tracking_with_BC_and_country/")
if(!dir.exists(tmp_folder_for_tracking_with_BC_and_country)){dir.create(tmp_folder_for_tracking_with_BC_and_country)}
tmp_folder_for_tracking_selected_users = paste0(tmp_folder,"tracking_selected_users/")
if(!dir.exists(tmp_folder_for_tracking_selected_users)){dir.create(tmp_folder_for_tracking_selected_users)}
tic()
sex_pop_time_series = foreach(tracking_file = tracking_files, .combine = rbind) %do% {
cat(tracking_file,"\n")
tracking = read_feather(path = paste0(input_folder,"tracking/",tracking_file))
m = match(tracking$user_id, users$user_id)
tracking$country_area = users$country_area[m]
tracking$latest_birth_control = users$latest_birth_control[m]
tracking$file = tracking_file
write_feather(tracking, path = paste0(tmp_folder_for_tracking_with_BC_and_country, tracking_file) )
j = which((tracking$user_id %in% users$user_id) &
(tracking$category %in% c("ANY","sex")))
tracking = tracking[j,]
write_feather(tracking, path = paste0(tmp_folder_for_tracking_selected_users, tracking_file) )
sex_pop_time_series =
ddply(tracking,
.(date,country_area,latest_birth_control,category,type),
.fun = summarise,
n = length(unique(user_id)))
return(sex_pop_time_series)
}
## tracking0000.feather
## tracking0001.feather
## tracking0002.feather
## tracking0003.feather
## tracking0004.feather
## tracking0005.feather
## tracking0006.feather
## tracking0007.feather
## tracking0008.feather
## tracking0009.feather
## tracking0010.feather
## tracking0011.feather
## tracking0012.feather
## tracking0013.feather
## tracking0014.feather
## tracking0015.feather
## tracking0016.feather
## tracking0017.feather
## tracking0018.feather
## tracking0019.feather
toc()
## 3209.89 sec elapsed
dim(sex_pop_time_series)
## [1] 2706381 6
write_feather(sex_pop_time_series, path = paste0(tmp_folder, "sex_pop_time_series.feather"))
ggplot(sex_pop_time_series, aes(x = date, y = n, col = type))+
geom_line()+
facet_grid(country_area ~ latest_birth_control)

sex_pop_ts = sex_pop_time_series[sex_pop_time_series$category == "sex",]
sex_pop_ts$key = paste0(sex_pop_ts$date, sex_pop_ts$country_area, sex_pop_ts$latest_birth_control)
any_pop_ts = sex_pop_time_series[sex_pop_time_series$category == "ANY",]
any_pop_ts$key = paste0(any_pop_ts$date, any_pop_ts$country_area, any_pop_ts$latest_birth_control)
sex_pop_ts$any = any_pop_ts$n[match(sex_pop_ts$key, any_pop_ts$key)]
sex_pop_ts$norm_n = sex_pop_ts$n / sex_pop_ts$any
ggplot(sex_pop_ts, aes(x = date, y = norm_n, col = type))+
geom_line()+
facet_grid(country_area ~ latest_birth_control)

j = which((sex_pop_ts$date > "2017-06-30") )
ggplot(sex_pop_ts[j,], aes(x = date, y = norm_n, col = type))+
geom_line()+
facet_grid(latest_birth_control ~ country_area)

j = which((sex_pop_ts$date > "2017-06-30") &
(sex_pop_ts$type %in% c("unprotected_sex","protected_sex")))
ggplot(sex_pop_ts[j,], aes(x = date, y = norm_n, col = latest_birth_control))+
geom_line()+
facet_grid(type ~ country_area)

sex_pop_ts_sum = aggregate(n ~ date + country_area + latest_birth_control + key, sex_pop_ts, sum)
sex_pop_ts_sum$any = any_pop_ts$n[match(sex_pop_ts_sum$key, any_pop_ts$key)]
sex_pop_ts_sum$norm_n = sex_pop_ts_sum$n / sex_pop_ts_sum$any
j = which((sex_pop_ts_sum$date > "2017-06-30") & (sex_pop_ts_sum$latest_birth_control %in% c("condoms","none","pill")))
ggplot(sex_pop_ts_sum[j,], aes(x = date, y = norm_n))+
geom_line()+
facet_grid(latest_birth_control ~ country_area)

sex_pop_ts_sum$weekday = wday(sex_pop_ts_sum$date)
sex_pop_ts_sum_wd = ddply(sex_pop_ts_sum,
.(weekday, country_area, latest_birth_control),
summarise,
n = sum(n),
any = sum(any))
sex_pop_ts_sum_wd$norm_n = sex_pop_ts_sum_wd$n / sex_pop_ts_sum_wd$any
j = which(sex_pop_ts_sum_wd$latest_birth_control %in% c("condoms","none","pill"))
ggplot(sex_pop_ts_sum_wd[j,], aes(x = weekday, y = norm_n, col = latest_birth_control) )+
geom_line()+
facet_grid(country_area ~ ., scale = "free")

sex_pop_ts_sum$month = month(sex_pop_ts_sum$date)
sex_pop_ts_sum_m = ddply(sex_pop_ts_sum,
.(month, country_area, latest_birth_control),
summarise,
n = sum(n),
any = sum(any))
sex_pop_ts_sum_m$norm_n = sex_pop_ts_sum_m$n / sex_pop_ts_sum_m$any
j = which(sex_pop_ts_sum_m$latest_birth_control %in% c("condoms","none","pill"))
ggplot(sex_pop_ts_sum_m[j,], aes(x = month, y = norm_n, col = latest_birth_control) )+
geom_line()+
facet_grid(country_area ~ ., scale = "free")+
scale_x_continuous(breaks = 1:12)

j = which(sex_pop_ts_sum_m$latest_birth_control %in% c("condoms","none"))
ggplot(sex_pop_ts_sum_m[j,], aes(x = month, y = norm_n, col = latest_birth_control) )+
geom_line()+
facet_grid(country_area ~ ., scale = "free")+
scale_x_continuous(breaks = 1:12)

sex_pop_ts_sum_m$country_area = factor(sex_pop_ts_sum_m$country_area, levels = countries)
j = which(sex_pop_ts_sum_m$latest_birth_control %in% c("condoms"))
ggplot(sex_pop_ts_sum_m[j,], aes(x = month, y = norm_n, col = country_area) )+
geom_line()+
facet_grid(country_area ~ ., scale = "free")+
scale_x_continuous(breaks = 1:12)

ok = foreach(country = unique(sex_pop_ts_sum$country_area)) %do% {
cat(country, "\n")
sub = sex_pop_ts_sum[which((sex_pop_ts_sum$country_area == country) &
(sex_pop_ts_sum$latest_birth_control == "condoms") &
(sex_pop_ts_sum$date %in% seq(as.Date("2017-07-01"),as.Date("2019-06-30"),by = 1))),]
t = seq(as.Date("2017-07-01"),as.Date("2019-06-30"),by = 1)
x = sub$norm_n[match(t, sub$date)]
x = na_interpolation(x)
plot(sub$date, sub$norm_n, type = "l")
plot(t, x, type = "l")
tx = data.frame(t = 1:length(t), x = x)
trend_model = loess(x ~ t, tx, degree = 2, span = 1)
y = predict(trend_model, data.frame(t = tx$t))
plot(tx$t, x, type = "l")
points(tx$t, y, type = "l", col = "red")
tx$trend = y
tx$x_no_trend = tx$x - tx$trend
signal_ts = ts(tx$x_no_trend, frequency = 7)
stl_w = stl(signal_ts, s.window = "periodic")
plot(stl_w)
tx$weekly = stl_w$time.series[,1]
tx$x_no_trend_no_weekly = tx$x_no_trend - tx$weekly
o = order(tx$x_no_trend_no_weekly, decreasing = TRUE)
cat(as.character(sort(t[o[1:20]])),"\n")
plot(t, tx$x_no_trend_no_weekly, type = "l")
points(t[o[1:20]], tx$x_no_trend_no_weekly[o[1:20]], col = "red")
}
## Australia




## 2017-07-15 2017-08-04 2017-09-06 2017-10-28 2018-01-11 2018-01-22 2018-01-29 2018-02-26 2018-04-17 2018-05-01 2018-06-09 2018-09-30 2018-10-01 2018-10-21 2018-11-18 2019-01-12 2019-01-28 2019-03-10 2019-04-04 2019-04-25

## Brazil - Central-West




## 2017-07-07 2017-07-19 2017-09-02 2017-10-12 2018-01-01 2018-02-12 2018-02-13 2018-04-15 2018-04-29 2018-05-01 2018-05-31 2018-09-07 2018-10-12 2018-11-15 2019-01-01 2019-03-04 2019-03-31 2019-04-13 2019-05-01 2019-06-01

## Brazil - Northeast




## 2017-07-15 2017-07-25 2017-07-27 2017-09-04 2017-09-07 2017-09-15 2017-09-25 2017-10-07 2017-11-08 2017-11-14 2017-11-17 2017-11-24 2018-01-13 2018-01-21 2018-06-24 2018-07-22 2018-08-03 2018-09-21 2018-10-01 2019-06-12

## Canada




## 2017-07-16 2017-07-18 2017-08-05 2017-08-20 2017-10-04 2017-10-09 2017-10-15 2017-11-08 2018-01-23 2018-04-28 2018-06-02 2018-09-03 2018-09-09 2018-10-17 2018-10-19 2019-01-01 2019-02-02 2019-02-19 2019-03-31 2019-05-20

## France




## 2017-07-07 2017-07-30 2017-08-01 2017-08-14 2017-08-16 2017-08-21 2017-09-16 2017-09-17 2017-10-22 2018-01-01 2018-01-20 2018-04-07 2018-04-28 2018-05-01 2018-05-08 2018-05-20 2018-10-29 2019-01-01 2019-05-01 2019-05-30

## Germany




## 2017-07-27 2017-08-28 2017-10-03 2017-12-26 2017-12-27 2018-01-01 2018-03-10 2018-04-02 2018-04-07 2018-05-01 2018-05-10 2018-05-21 2018-10-03 2018-12-31 2019-01-01 2019-03-02 2019-04-28 2019-05-01 2019-05-30 2019-06-10

## Mexico




## 2017-07-05 2017-07-17 2018-01-01 2018-01-09 2018-01-19 2018-02-04 2018-02-14 2018-02-17 2018-02-23 2018-04-02 2018-04-21 2018-04-30 2018-05-01 2018-09-01 2018-09-22 2019-01-01 2019-02-04 2019-02-14 2019-03-17 2019-03-18

## United Kingdom




## 2017-07-03 2017-07-24 2017-10-14 2017-10-22 2017-12-27 2017-12-28 2018-01-01 2018-01-02 2018-01-14 2018-02-14 2018-03-17 2018-04-10 2018-05-07 2018-12-31 2019-01-01 2019-01-27 2019-02-14 2019-03-09 2019-05-06 2019-05-27

## United States - California




## 2017-07-02 2017-07-17 2017-07-24 2017-08-05 2017-08-08 2017-09-05 2017-09-28 2017-11-17 2017-11-22 2017-12-15 2018-01-01 2018-01-17 2018-03-24 2018-04-29 2018-06-09 2018-06-20 2018-07-23 2018-08-12 2019-01-01 2019-01-18

## United States - Midwest




## 2017-07-17 2017-07-22 2017-08-09 2017-08-14 2017-09-06 2017-09-11 2017-09-16 2017-09-24 2017-10-27 2017-11-11 2018-06-16 2018-06-26 2018-07-04 2018-07-30 2018-08-20 2018-11-24 2018-12-21 2019-01-01 2019-05-09 2019-06-02
